Section 1. Data: dataset, preprocessing, visualisation, preliminary examination. Section 2. Machine learning methods: model parameter tuning, interpretation.
Required packages
# Install any packages in `pkg` that are not yet installed, then load them all.
#
# @param pkg character vector of package names
# @return named logical vector: TRUE for each package that loaded successfully
ipak <- function(pkg) {
  new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
  # Explicit length check instead of relying on 0/`length` truthiness
  if (length(new.pkg) > 0) {
    install.packages(new.pkg, dependencies = TRUE)
  }
  # vapply keeps the return type stable (named logical vector), unlike sapply
  vapply(pkg, require, logical(1), character.only = TRUE)
}
#repos='http://cran.muenster.r-project.org'
# Package groups, installed/loaded in a single ipak() call below.
stdata <- c("sp", "sf", "raster")
Stat_methods <- c("lme4", "glmnet", "ranger", "gbm", "xgboost", "party", "caret", "gstat")
visual <- c("RColorBrewer", "ggplot2", "corrplot", "tmap")
map <- c("maptools")
tidy <- c("devtools", "dplyr", "tidyr", "knitr")
other <- c("countrycode", "data.table", "Matrix", "GGally", "pdp")
# Optional packages for chunks meant to be run after the workshop or in other
# experiments (other scripts, not in this workshop).
optional <- c("leafem", "vip", "DT", "sparkline", "leaflet", "mapview",
              "htmlwidgets", "rasterVis", "tibble", "shiny")
packages <- c(stdata, tidy, Stat_methods, visual, map, other)
ipak(packages)
## sp sf raster devtools dplyr tidyr
## TRUE TRUE TRUE TRUE TRUE TRUE
## knitr lme4 glmnet ranger gbm xgboost
## TRUE TRUE TRUE TRUE TRUE TRUE
## party caret gstat RColorBrewer ggplot2 corrplot
## TRUE TRUE TRUE TRUE TRUE TRUE
## tmap maptools countrycode data.table Matrix GGally
## TRUE TRUE TRUE TRUE TRUE TRUE
## pdp
## TRUE
APMtools is an R package providing datasets for global air pollution modelling and tools to streamline and facilitate air pollution mapping using statistical methods. Please have a read of it on Github.
# APMtools is on GitHub (not CRAN); install_github() comes from devtools,
# which was loaded by ipak() above.
install_github("mengluchu/APMtools")
library(APMtools)
# List everything exported by APMtools
ls("package:APMtools")
## [1] "Brt_imp" "Brt_LUR" "Brt_pre"
## [4] "cforest_LUR" "countrywithppm" "create_ring"
## [7] "ctree_LUR" "DENL_new" "error_matrix"
## [10] "global_annual" "join_by_id" "Lasso"
## [13] "Lasso_pre" "Lassoselected" "mechanical"
## [16] "merge_roads" "merged" "merged_nightlight"
## [19] "mergeraster2file" "plot_error" "plot_rsq"
## [22] "ppm2ug" "predicLA_RF_XGBtiles" "prediction_with_pp_La"
## [25] "ranger_stack" "RDring_coef" "removedips"
## [28] "retrieve_predictor" "rf_imp" "rf_Lasso_LUR"
## [31] "rf_LUR" "rf_pre" "sampledf"
## [34] "scatterplot" "subset_grep" "univar_rsq"
## [37] "xgb_pre" "xgb_stack" "xgboost_imp"
## [40] "xgboost_LUR"
Load data:
# Original raw-data preparation (station files are not distributed with the
# workshop; kept for reference only):
#gd = fread("~/Documents/GitHub/Global mapping/2020_06_world/stations_20200602.csv")
#avg = fread("~/Documents/GitHub/Global mapping/oaqEUAUCAUS.csv")
#gdata = merge(gd, avg, by.x = c("long", "lat"), by.y = c("LONGITUDE","LATITUDE" ))
#g1 = na_if(gdata, -9999.9)
#g2 = g1%>%dplyr::select(-id, -dir, -V1)%>%filter(value_mean >0)
# Load the prepared annual dataset shipped with APMtools
data("global_annual")
Predictor variables to include:
# Regex matching the predictor families used throughout the workshop
vastring <- "road|nightlight|population|temp|wind|trop|indu|elev"
Dataset:
1. road_class_XX_size: road length within a buffer with radius “size” of type XX. ROAD_1: highway, ROAD_2: primary, ROAD_3: secondary, ROAD_4: tertiary, ROAD_5: unpaved
2. industry_size: Industrial area within a buffer with radius “size”.
3. trop_mean: TROPOMI averaged over Feb 2018 - Jan 2019.
4. temperature_2m_m: monthly mean temperature at 2m height of month “m”.
5. wind_speed_10m_m:monthly mean wind speed at 2m height of month “m”.
6. population_1000/ 3000 /5000: population at 1, 3, 5 km resolution.
7. Rsp: Surface remote sensing and chemical transport model product (only for 2012).
8. OMI_mean_filt: OMI column density, 2017 annual average.
9. nightlight_size: nightlight VIIRS data in original resolution (500 m) and various buffer sizes.
NO2 annual concentration, all the units are converted to \(\mu g/m^3\) (micro grams per cubic meter). In the data display later you can see countries with different units.
# Five-number summary (plus mean) of the annual mean NO2 concentration (ug/m3)
global_annual %>% dplyr::select(value_mean ) %>% summary()
## value_mean
## Min. : 0.00022
## 1st Qu.:13.85912
## Median :22.48644
## Mean :24.37987
## 3rd Qu.:32.65573
## Max. :98.34452
#datatable(g2, rownames = FALSE, filter = "top", options = list(pageLength = 5, scrollX = T))
Merge roads of different road types; here 3, 4, 5 means the road lengths of these roads (i.e. road_class_3, road_class_4, and road_class_5) are aggregated. The original road types are substituted (with keep = T, they are retained).
# Aggregate road classes 3, 4, 5 into road_class_M345_*; keep = F drops the originals
merged_mr = merge_roads(global_annual, c(3, 4, 5), keep = F) # keep = T keeps the original roads.
# Inspect the resulting column names
names(merged_mr)
## [1] "road_class_M345_1000" "road_class_M345_100" "road_class_M345_25"
## [4] "road_class_M345_3000" "road_class_M345_300" "road_class_M345_5000"
## [7] "road_class_M345_500" "road_class_M345_50" "road_class_M345_800"
## [10] "long" "lat" "nightlight_800"
## [13] "nightlight_500" "nightlight_5000" "nightlight_3000"
## [16] "nightlight_1000" "elevation" "industry_1000"
## [19] "industry_100" "industry_25" "industry_3000"
## [22] "industry_300" "industry_5000" "industry_500"
## [25] "industry_50" "industry_800" "OMI_mean_filt"
## [28] "population_1000" "population_3000" "population_5000"
## [31] "road_class_1_1000" "road_class_1_100" "road_class_1_25"
## [34] "road_class_1_3000" "road_class_1_300" "road_class_1_5000"
## [37] "road_class_1_500" "road_class_1_50" "road_class_1_800"
## [40] "road_class_2_1000" "road_class_2_100" "road_class_2_25"
## [43] "road_class_2_3000" "road_class_2_300" "road_class_2_5000"
## [46] "road_class_2_500" "road_class_2_50" "road_class_2_800"
## [49] "Rsp" "temperature_2m_10" "temperature_2m_11"
## [52] "temperature_2m_12" "temperature_2m_1" "temperature_2m_2"
## [55] "temperature_2m_3" "temperature_2m_4" "temperature_2m_5"
## [58] "temperature_2m_6" "temperature_2m_7" "temperature_2m_8"
## [61] "temperature_2m_9" "trop_mean_filt" "wind_speed_10m_10"
## [64] "wind_speed_10m_11" "wind_speed_10m_12" "wind_speed_10m_1"
## [67] "wind_speed_10m_2" "wind_speed_10m_3" "wind_speed_10m_4"
## [70] "wind_speed_10m_5" "wind_speed_10m_6" "wind_speed_10m_7"
## [73] "wind_speed_10m_8" "wind_speed_10m_9" "value_mean"
## [76] "country" "GHI"
Visualization with tmap is much more convenient than using leaflet directly, but with the latter you have more control over your map. Here tmap is used; with the script below, a map is saved to the same location as your Rmarkdown file.
# Interactive map of station NO2 annual means, saved as NO2mean.html next to
# the Rmarkdown file.
locations_sf = st_as_sf(merged_mr, coords = c("long", "lat"))
osm_valuemean = tm_shape(locations_sf) +
  tm_dots("value_mean", col = "value_mean", size = 0.05, title = "NO2 value",
          popup.vars = c("value_mean")) +
  tm_view(basemaps = c("OpenStreetMap"))
#+tm_shape(lnd)+tm_lines()
tmap_save(osm_valuemean, "NO2mean.html")
Checking missing data
# Map the stations with missing climate data. Use dedicated object names so the
# full-dataset `locations_sf` (used later for variograms) is NOT overwritten by
# this filtered subset — the original code clobbered it.
climatemissing = merged_mr %>% filter(is.na(wind_speed_10m_10))
locations_missing_sf = st_as_sf(climatemissing, coords = c("long", "lat"))
osm_missing = tm_shape(locations_missing_sf) +
  tm_dots("value_mean", col = "value_mean", size = 0.05, title = "NO2 value",
          popup.vars = c("value_mean")) + tm_view(basemaps = c("OpenStreetMap"))
#+tm_shape(lnd)+tm_lines()
tmap_save(osm_missing, "climatemissing.html")
Boxplot
# Boxplot labels: "ISO2:Full country name", with "*" marking countries listed
# in countrywithppm (measurements originally reported in ppm) and the station
# count appended in parentheses.
countryname = paste(merged_mr$country, countrycode(merged_mr$country, 'iso2c', 'country.name'), sep = ":")
#tag country with ppm
# use the per-country median NO2 for the box fill colour
mergedmedian = merged_mr %>% group_by(country) %>% mutate(median = median(value_mean, na.rm = TRUE))
nrow(merged_mr)
# Station count per country
merged_mr = merged_mr %>% group_by(country) %>% mutate(count1 = n())
# x %in% y[y %in% x] is equivalent to x %in% y, so the inner subset is dropped
countryname_s_e = ifelse(merged_mr$country %in% countrywithppm,
                         paste0(countryname, "*"), countryname)
mergedmedian$countryfullname = paste0(countryname_s_e, " (", merged_mr$count1, ")")
bp2 <- ggplot(mergedmedian, aes(x = countryfullname, y = value_mean, group = country)) +
  labs(x = "Country", y = expression(paste("NO"[2], " ", mu, "g/", "m"^3)), cex = 1.5) +
  geom_boxplot(aes(fill = median)) +
  theme(text = element_text(size = 13),
        axis.text.x = element_text(angle = 90, hjust = 1)) +
  scale_fill_distiller(palette = "Spectral")
# scale_color_brewer(direction = 1)
print(bp2 + ylim(0, 100))
#ggsave("boxplot.png")
Plot the paired correlation (Pearson correlation coefficient) for road predictors, population, and TROPOMI. Compare the correlation graphs between Germany and the world. Globally, the correlations between NO2 concentration and predictor variables are much lower, calling for a non-linear method to model the heterogeneities between countries.
# Pairwise Pearson correlations (Germany only) among NO2 and the
# road/nightlight/TROPOMI predictors
merged_mr %>% na.omit %>% filter(country == "DE") %>% ungroup() %>%dplyr::select(matches("value_mean|road_|night|trop")) %>% cor %>% corrplot(type = "upper", method = "pie", tl.cex = 0.7)
# Country variants — try these after the workshop:
#merged_mr %>% na.omit %>% filter(country == "US") %>% ungroup() %>%dplyr::select(matches("value_mean|road_|night|trop")) %>% cor %>% corrplot(type = "upper", method = "pie", tl.cex = 0.7)
#merged_mr %>% na.omit %>% filter(country == "CA") %>% ungroup() %>%dplyr::select(matches("value_mean|road_|night|trop")) %>% cor %>% corrplot(type = "upper", method = "pie", tl.cex = 0.7)
#merged_mr %>% na.omit %>% filter(country == "CN") %>% ungroup() %>%dplyr::select(matches("value_mean|road_|night|trop")) %>% cor %>% corrplot(type = "upper", method = "pie", tl.cex = 0.7)
# Same correlations on the full global dataset, adding population predictors
merged_mr %>% na.omit %>% ungroup() %>%dplyr::select(matches("value_mean|road_|night|trop|pop")) %>% cor %>% corrplot(type = "upper", method = "pie", tl.cex = 0.7)
Inspecting spatial dependency using the variogram, assuming the cutoff is 1 degree (approximately 111 km at the equator).
# Empirical variogram of value_mean, cutoff 1 degree.
# NOTE(review): make sure locations_sf still holds the FULL dataset here —
# earlier in this document it is reassigned to the climate-missing subset;
# verify before interpreting this variogram.
grd_sp <- as_Spatial(locations_sf)
dt.vgm = variogram(value_mean~1, grd_sp, cutoff= 1)
plot(dt.vgm)
#Moran I test
#install.packages("ape", dependencies = TRUE)
#library(ape)
#merged_mrf = merged_mr%>%filter(country == "US")
#no2.dists <- as.matrix(dist(cbind(merged_mrf$LONGITUDE, merged_mrf$LATITUDE)))
#no2.dists[1:5, 1:5]
#no2.inv <- 1/no2.dists
#diag(no2.inv) <- 0
#no2.inv[1:5, 1:5]
#Moran.I(merged_mrf$value_mean, na.rm = T, no2.inv)
Inspecting spatial correlation within countries. The function countryvariogram has two variables, @params COUN: the country code @params cutoff: the variogram cutoff in degree.
# Plot the empirical variogram of value_mean within a single country.
#
# @param COUN ISO2 country code, e.g. "DE"
# @param cutoff variogram cutoff in degrees (~111 km per degree at the equator)
# @param locations sf point layer with `country` and `value_mean` columns;
#   defaults to the global `locations_sf` for backward compatibility, but
#   passing it explicitly avoids depending on global state.
countryvariogram = function(COUN, cutoff, locations = locations_sf){
  loca = locations %>% filter(country == COUN)
  grd_sp <- as_Spatial(loca)
  dt.vgm = variogram(value_mean~1, grd_sp, cutoff = cutoff)
  plot(dt.vgm)
}
You can try different cutoffs and see the changes in variograms.
# Compare within-country spatial correlation; try other cutoffs as well.
countryvariogram("DE", 1)
countryvariogram("US", 1)
# NOTE(review): the CN variogram looks different — possibly station density or
# spatial clustering; verify against the station map.
countryvariogram("CN", 1) # reason?
For the illustration purpose, we simply remove missing data, there are not many, 41. In practice, careful handling is needed to choose between removing missing data, imputation or handle them explicitly in the model.
# Drop rows with any missing value (41 rows); simplest handling for the workshop
inde_var = merged_mr %>%na.omit() # 70 variables
The scatter plots between predictors and mean NO2, Germany and global datasets. loess: moving regression, non-parametric, each smoothed value is given by a weighted linear least squares regression over the span.
# Germany only: merged roads, nightlight, July temperature vs NO2 (loess smoother)
inde_var %>% filter(country=="DE")%>%ungroup%>% dplyr::select(matches("road_class_M345|nightlight|temperature_2m_7|value")) %>% scatterplot(y_name = "value_mean", fittingmethod = "loess")
# Germany only: primary roads vs NO2 (GAM smoother)
inde_var %>% filter(country=="DE")%>%ungroup%>% dplyr::select(matches("road_class_2|value")) %>% scatterplot(y_name = "value_mean", fittingmethod = "gam")
# Global: same variable sets for comparison with the Germany-only plots
inde_var %>%ungroup%>% dplyr::select(matches("road_class_M345|nightlight|temperature_2m_7|value")) %>% scatterplot(y_name = "value_mean", fittingmethod = "loess")
inde_var %>% ungroup%>% dplyr::select(matches("road_class_2|value")) %>% scatterplot(y_name = "value_mean", fittingmethod = "loess")
inde_var %>% ungroup%>% dplyr::select(matches("road_class_1|value")) %>% scatterplot(y_name = "value_mean", fittingmethod = "gam")
#inde_var %>% dplyr::select(matches("Tro|OMI|Rsp|night_value")) %>% scatterplot(y_name = "night_value", fittingmethod = "loess")
# why?
# can choose any variable to look at the scatterplot
#inde_var %>% dplyr::select(matches("ROAD_1|day_value")) %>% scatterplot(y_name = "day_value", fittingmethod = "gam")
Discussion 1, try different scatterplot and discuss about the findings
If we use a single regression tree, the tree and its prediction error will differ when shuffling training and testing data. To reduce this variance, a method called “bagging” was developed, which aggregates over (weak) voters. If the predictor variables are correlated, the reduction in variance diminishes; Random Forest comes to the rescue by randomly sampling variables.
XGBoost is certainly popular, having won many Kaggle (machine learning prediction) competitions. It has a few attractive features such as scalability, but most importantly it superimposes regularization to prevent model over-fitting. However, in my experience, though I often obtain the lowest RMSE or highest R2 with XGBoost, the spatial pattern it predicts looks a lot worse than random forest, or even a simpler linear model with regularization – Lasso. It appears to predict lots of artefacts. We further compared various model prediction results with mobile sensor measurements (with a typical Dutch transport tool, the “bakfiets”, a cargo bike), and found XGBoost matched the mobile sensor measurements the least — no matter how I tuned the model.
# Remove grouping and the country column before model fitting
inde_var = inde_var%>%ungroup()%>%dplyr::select(-country)
Here is an example of the effects of hyperparameters on cross-validation results and prediction patterns of XGboost.
Note
Here I only show how to tune hyper-parameters; a detailed description and the hyper-parameters I tuned are in “Glo_hyp_tun.Rmd”.
It is commonly (and strongly) recommended in deep learning to split the data into 3: model training; hyperparameter optimization; testing. The reason is that the hyper-parameter optimization process may cause information leakage. However, separating a test dataset out may cause more severe bias. Actually, this question haunted me for a very long time since I started pulling out my results, and also generated heated discussions between me and a reviewer of my recent global mapping paper. To reflect, [I wrote a discussion about this](https://tomatofox.wordpress.com/2020/04/20/how-to-assess-accuracy-of-a-machine-learning-method-when-observations-are-limited/), comments appreciated!
Let’s come back to model tuning. Here I am showing how to do this with the R package caret; there are other packages, such as mlr3, but caret seems to be well-maintained and is sufficient in most cases, and you can simply use ggplot on the caret::train object. All we need is to customize a tuning grid and control the resampling method.
First, Random Forest: the most important hyperparameters are mtry — the number of variables sampled at each split — and min.node.size — the minimum number of observations at a leaf. The number of trees is also a hyperparameter, but it can be set as high as you like, as it will not cause model over-fitting because each tree is grown independently; this differs from boosting, which grows trees sequentially. Of course, you can also tune it.
[Try after the workshop as it takes quite a while; set the “eval =T”]
# Resampling scheme: 3-fold cross-validation
trl <- trainControl(method = "cv", number = 3)
# Hyper-parameter grid for ranger: mtry, split rule, minimum node size
tgrid <- expand.grid(
  .mtry = seq(7, 30, by = 3),
  .splitrule = "variance",
  .min.node.size = c(5, 10)
)
# Fit over the grid and plot CV performance against the hyper-parameters
caret::train(value_mean ~ ., data = inde_var, method = "ranger",
             trControl = trl, tuneGrid = tgrid) %>% ggplot()
#The final values used for the model were mtry = 25, splitrule = variance and min.node.size = 5.
In the same way, we can tune GBM [ Try after the workshop as it takes quite a while].
Tunning XGBoost is more complex as it has a lot more hyperparameters to tune: https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
1. gamma[default=0][range: (0,Inf)] It controls regularization (or prevents overfitting). The optimal value of gamma depends on the data set and other parameter values. Higher the value, higher the regularization. Regularization means penalizing large coefficients which don’t improve the model’s performance. default = 0 means no regularization. Tune trick: Start with 0 and check CV error rate. If you see train error >> test error, bring gamma into action.
2. lambda and Alpha: similar to the Lasso Lambda, control the strength of regularization
3. nrounds[default=100] It controls the maximum number of iterations. For classification, it is similar to the number of trees to grow. Should be tuned using CV
4. eta[default=0.3][range: (0,1)] It controls the learning rate, i.e., the rate at which our model learns patterns in data. After every round, it shrinks the feature weights to reach the best optimum. Lower eta leads to slower computation. It must be supported by increase in nrounds. Typically, it lies between 0.01 - 0.3
5. max_depth[default=6][range: (0,Inf)] It controls the depth of the tree. Larger data sets require deep trees to learn the rules from data. Should be tuned using CV
# XGBoost tuning grid: tune nrounds, max_depth and eta; fix the regularization
# and sampling parameters.
xgboostgrid = expand.grid(nrounds = seq(300, 2000, by = 50), max_depth = 3:5,
                          eta = seq(0.05, 0.2, by = 0.05), gamma = 1,
                          colsample_bytree = 1,
                          min_child_weight = 1,
                          subsample = 0.7)
# Do NOT name this object "trainControl" — that masks caret::trainControl().
xgb_ctrl <- trainControl(method = "cv", number = 5, savePredictions = "final",
                         allowParallel = TRUE) # 5-fold CV
# train the model and plot CV performance over the grid
train(value_mean~., data=inde_var, method="xgbTree",
      trControl = xgb_ctrl, tuneGrid = xgboostgrid)%>%ggplot()
If you train a single tree, we can see the tree is stochastic. But we can look at the tree structure to get some intuition about the model structure.
# NOTE(review): in the original document `training` and `test` are only defined
# further down; define the 80/20 split here (if absent) so this chunk is
# self-contained and runnable in order.
if (!exists("training") || !exists("test")) {
  smp_size <- floor(0.8 * nrow(inde_var))
  training <- sample(seq_len(nrow(inde_var)), size = smp_size)
  test <- seq_len(nrow(inde_var))[-training]
}
# Grow a single conditional-inference tree under different seeds:
# the fitted tree changes with the seed, illustrating tree instability.
for (i in 2:5)
{
set.seed(i)
ctree_LUR(inde_var, y_varname= c("value_mean"), training = training, test = test, grepstring = vastring)
}
Random forests create a diverse set of trees because 1) trees are unstable w.r.t. changes in the learning/training data (bagging), and 2) mtry splitting variables are randomly preselected at each split.
# 80/20 train/test split by row index
smp_size <- floor(0.8 * nrow(inde_var))
training <- sample(seq_len(nrow(inde_var)), size = smp_size)
test <- seq_len(nrow(inde_var))[-training]
# Baseline random forest with 500 trees on the training rows
r1 <- ranger(value_mean ~ ., data = inde_var[training, ], num.trees = 500)
r1
## Ranger result
##
## Call:
## ranger(value_mean ~ ., data = inde_var[training, ], num.trees = 500)
##
## Type: Regression
## Number of trees: 500
## Sample size: 4260
## Number of independent variables: 76
## Mtry: 8
## Target node size: 5
## Variable importance mode: none
## Splitrule: variance
## OOB prediction error (MSE): 51.4215
## R squared (OOB): 0.7352291
# Hold-out accuracy of the plain random forest (RMSE, MAE, R2, ...)
error_matrix(inde_var$value_mean[test], predictions(predict(r1, inde_var[test,])))
## RMSE RRMSE IQR rIQR MAE
## 7.0205606 0.2911164 6.7820274 0.3095243 4.9064112
## rMAE rsq explained_var
## 0.2034505 0.7513529 0.7513072
[Advanced] A step further: postprocessing using Lasso to combine trees (instead of a simple aggregation). In this way we can reduce the number of trees, which can result in higher prediction accuracy, reduced model overfitting, and a more concise model. This method is introduced in the book The Elements of Statistical Learning (but is rarely seen in applications, most likely because there is no implemented function), yet it is very simple to do; let’s make a new function called prediction_with_pp_La (prediction with postprocessing using Lasso).
#' Postprocess a random forest: combine per-tree predictions with a Lasso
#'
#' @param rfmodel ranger model (fitted with enough trees; predict.all is used)
#' @param trainingXY training data containing the response and the predictor
#'   names used by rfmodel
#' @param trainingY training response vector
#' @param testingX testing data with the predictor names used by rfmodel
#' @return Lasso-combined predictions for testingX
prediction_with_pp_La=function(rfmodel, trainingXY, trainingY, testingX)
{
  # Per-tree predictions on the training data (n_obs x n_trees), not their mean.
  # Fixed: the original body used the global `r1` instead of `rfmodel`.
  allp = predict(rfmodel, trainingXY, predict.all = T)%>%predictions
  # Combine the trees with a non-negative Lasso; alpha between 0 and 1 would
  # give an elastic net instead.
  cvfit <- glmnet::cv.glmnet(allp, trainingY,
                             type.measure = "mse", standardize = TRUE, alpha = 1,
                             lower.limit = 0)
  # Number of trees kept (non-zero Lasso coefficients)
  print(sum(coef(cvfit)[-1]!= 0))
  # Plot the selected trees, using a tool from APMtools
  Lassoselected(cvfit)
  # Per-tree predictions on the test data, then weight them with the Lasso fit
  rpre = predict(rfmodel, testingX, predict.all = T)%>%predictions
  predict(cvfit, newx = rpre) # use the regularization (here lasso) model to predict
}
# Spell out trainingXY in full: the original `trainingX =` only worked through
# R's partial argument matching.
pre = prediction_with_pp_La(rfmodel = r1, trainingXY = inde_var[training,], trainingY = inde_var$value_mean[training], testingX = inde_var[test,])
## [1] 145
We got a small improvement in prediction accuracy, both in terms of bias and variance, and we reduced the number of trees to grow! You can even see the distribution of the weights of the trees.
# Hold-out accuracy of the Lasso-postprocessed forest (compare with the plain RF above)
error_matrix(inde_var$value_mean[test], pre)
## RMSE RRMSE IQR rIQR MAE
## 6.9257598 0.2871853 6.8404510 0.3121907 4.7967183
## rMAE rsq explained_var
## 0.1989019 0.7580226 0.7578587
# number of trees selected: reduced from 500 to 145!
Here I am showing the “gbm” package. The “dismo” package extends “gbm” with the deviance calculated from a distribution that can be chosen by users. Note though the dismo is built on gbm functions, the hyperparameters are slightly different. Use 800 trees for faster calculation. In practice this will be increased.
# Gradient boosting with gaussian loss: 800 trees (kept low for speed; increase
# in practice), depth-6 interactions, learning rate 0.01, 50% subsampling.
gbm1 = gbm(formula = value_mean~., data =inde_var, distribution = "gaussian", n.trees = 800,
interaction.depth = 6, shrinkage = 0.01, bag.fraction = 0.5 )
gbm1
## gbm(formula = value_mean ~ ., distribution = "gaussian", data = inde_var,
## n.trees = 800, interaction.depth = 6, shrinkage = 0.01, bag.fraction = 0.5)
## A gradient boosted model with gaussian loss function.
## 800 iterations were performed.
## There were 76 predictors of which 73 had non-zero influence.
# Relative influence of each predictor (printed and plotted)
summary(gbm1)
## var rel.inf
## trop_mean_filt trop_mean_filt 36.20521689
## population_5000 population_5000 14.88884541
## population_3000 population_3000 9.55653597
## population_1000 population_1000 6.26321387
## long long 2.33392925
## road_class_2_25 road_class_2_25 2.14368922
## road_class_2_50 road_class_2_50 1.81358263
## wind_speed_10m_3 wind_speed_10m_3 1.44784581
## road_class_2_100 road_class_2_100 1.39889847
## count1 count1 1.37919640
## road_class_1_50 road_class_1_50 1.21964225
## road_class_1_25 road_class_1_25 1.19819906
## nightlight_500 nightlight_500 1.05139626
## road_class_M345_3000 road_class_M345_3000 0.89709314
## road_class_M345_100 road_class_M345_100 0.79960191
## road_class_1_100 road_class_1_100 0.79867158
## road_class_1_300 road_class_1_300 0.75633647
## road_class_M345_25 road_class_M345_25 0.75626870
## OMI_mean_filt OMI_mean_filt 0.74596543
## wind_speed_10m_2 wind_speed_10m_2 0.71147052
## road_class_M345_50 road_class_M345_50 0.66879919
## road_class_M345_300 road_class_M345_300 0.65731654
## road_class_M345_1000 road_class_M345_1000 0.63391895
## road_class_2_3000 road_class_2_3000 0.63296788
## road_class_1_5000 road_class_1_5000 0.58539927
## road_class_2_5000 road_class_2_5000 0.52204851
## road_class_1_500 road_class_1_500 0.46868110
## lat lat 0.45660721
## temperature_2m_10 temperature_2m_10 0.45549149
## elevation elevation 0.44138587
## wind_speed_10m_1 wind_speed_10m_1 0.42652047
## Rsp Rsp 0.41697827
## wind_speed_10m_12 wind_speed_10m_12 0.40825407
## road_class_M345_500 road_class_M345_500 0.36753111
## road_class_M345_800 road_class_M345_800 0.35710773
## road_class_M345_5000 road_class_M345_5000 0.35265864
## temperature_2m_1 temperature_2m_1 0.35163371
## road_class_1_3000 road_class_1_3000 0.31765671
## nightlight_5000 nightlight_5000 0.30872535
## temperature_2m_12 temperature_2m_12 0.25377727
## industry_5000 industry_5000 0.25237947
## temperature_2m_6 temperature_2m_6 0.24907660
## temperature_2m_5 temperature_2m_5 0.24754579
## road_class_2_300 road_class_2_300 0.24067159
## GHI GHI 0.23275551
## temperature_2m_7 temperature_2m_7 0.21515600
## wind_speed_10m_11 wind_speed_10m_11 0.21041249
## road_class_1_800 road_class_1_800 0.20581396
## wind_speed_10m_4 wind_speed_10m_4 0.19811139
## temperature_2m_2 temperature_2m_2 0.19590139
## temperature_2m_4 temperature_2m_4 0.19152076
## wind_speed_10m_5 wind_speed_10m_5 0.18598401
## nightlight_1000 nightlight_1000 0.17058450
## wind_speed_10m_6 wind_speed_10m_6 0.16725517
## temperature_2m_3 temperature_2m_3 0.15659807
## nightlight_3000 nightlight_3000 0.15476681
## industry_3000 industry_3000 0.14952886
## temperature_2m_9 temperature_2m_9 0.13514516
## wind_speed_10m_10 wind_speed_10m_10 0.11660467
## road_class_1_1000 road_class_1_1000 0.09525089
## wind_speed_10m_8 wind_speed_10m_8 0.09151512
## nightlight_800 nightlight_800 0.09115232
## road_class_2_500 road_class_2_500 0.08812765
## temperature_2m_8 temperature_2m_8 0.08578914
## temperature_2m_11 temperature_2m_11 0.08269779
## road_class_2_1000 road_class_2_1000 0.08180108
## wind_speed_10m_7 wind_speed_10m_7 0.07707861
## wind_speed_10m_9 wind_speed_10m_9 0.07154226
## industry_1000 industry_1000 0.03709572
## road_class_2_800 road_class_2_800 0.02639899
## industry_300 industry_300 0.02520810
## industry_800 industry_800 0.01525969
## industry_100 industry_100 0.00621187
## industry_25 industry_25 0.00000000
## industry_500 industry_500 0.00000000
## industry_50 industry_50 0.00000000
# 2-D partial dependence of the 2nd and 3rd predictors (selected by column index)
plot(gbm1, i.var = 2:3)
#plot(gbm1, i.var = 1)
#rf_residual <- pre_rf - rdf_test$NO2
# Build a sparse model matrix and DMatrix for a manual xgboost fit.
y_varname = "value_mean"
x = inde_var%>%dplyr::select(-value_mean)
df_x = data.table(inde_var, keep.rownames = FALSE)
df_y = inde_var$value_mean
formu = as.formula(paste0(y_varname, "~."))
# Drop the intercept column from the model matrix
dfmatrix_x = sparse.model.matrix(formu, data = df_x)[, -1]
train_b = xgb.DMatrix(data = dfmatrix_x, label = df_y)
max_depth = 4
eta = 0.01       # learning rate
nthread = 4
nrounds = 800
gamma_par = 2    # minimum loss reduction for a split (regularization)
# xgboost parameter names are case-sensitive: the original `Gamma = Gamma` was
# silently ignored (the log warned 'Parameters: { Gamma } might not be used');
# pass it as lowercase `gamma`.
bst <- xgboost(data = train_b, max_depth = max_depth, eta = eta,
               nthread = nthread, nrounds = nrounds, gamma = gamma_par,
               verbose = 1, print_every_n = 200, early_stopping_rounds = 50,
               maximize = FALSE)
## [13:11:28] WARNING: amalgamation/../src/learner.cc:480:
## Parameters: { Gamma } might not be used.
##
## This may not be accurate due to some parameters are only used in language bindings but
## passed down to XGBoost core. Or some parameters are not used but slip through this
## verification. Please open an issue if you find above cases.
##
##
## [1] train-rmse:27.681536
## Will train until train_rmse hasn't improved in 50 rounds.
##
## [201] train-rmse:8.145431
## [401] train-rmse:6.289529
## [601] train-rmse:5.831811
## [800] train-rmse:5.605877
caretEnsemble is computationally intensive. It can be customized using “emsemble.R”.
#install.packages("caretEnsemble")
#library(caretEnsemble)
# Stacking Algorithms - Run multiple algos in one call.
#trainControl <- trainControl(method = "repeatedcv",
# number = 10,
# repeats = 2,
# savePredictions = TRUE,
# classProbs = TRUE)
#algorithmList <- c('rf', 'adaboost', 'earth', 'xgbDART', 'svmRadial')
#set.seed(100)
#models <- caretList(value_mean ~ ., data = inde_var , trControl = trainControl, methodList = algorithmList)
#results <- resamples(models)
#summary(results)
Variable importance and partial dependence plots are commonly used to interpret models.
# Keep the response plus the predictor families matched by vastring
pre_mat = subset_grep(inde_var , grepstring = paste0("value_mean|",vastring))
# Permutation importance must be requested at fit time (importance = "permutation")
rf = ranger(value_mean~ ., data = na.omit( pre_mat), mtry = 25, num.trees = 1000, importance = "permutation")
rf
## Ranger result
##
## Call:
## ranger(value_mean ~ ., data = na.omit(pre_mat), mtry = 25, num.trees = 1000, importance = "permutation")
##
## Type: Regression
## Number of trees: 1000
## Sample size: 5326
## Number of independent variables: 70
## Mtry: 25
## Target node size: 5
## Variable importance mode: permutation
## Splitrule: variance
## OOB prediction error (MSE): 50.26291
## R squared (OOB): 0.7423215
# ranger method
# Permutation importance, largest first
sort(importance(rf), decreasing = T)
## trop_mean_filt population_5000 population_3000
## 87.196371344 33.319114721 24.633736474
## population_1000 wind_speed_10m_3 wind_speed_10m_2
## 17.126236031 11.539447550 10.933172722
## road_class_2_5000 wind_speed_10m_1 temperature_2m_5
## 9.599269820 4.976851131 4.731808422
## temperature_2m_7 temperature_2m_2 road_class_2_50
## 4.524979943 4.449618247 4.434140554
## wind_speed_10m_5 nightlight_5000 wind_speed_10m_9
## 4.432101244 4.116411979 3.715958237
## wind_speed_10m_8 road_class_2_25 temperature_2m_1
## 3.713153296 3.711094215 3.532346867
## temperature_2m_12 wind_speed_10m_4 road_class_1_5000
## 3.526726555 3.486641409 3.436250833
## temperature_2m_10 temperature_2m_6 temperature_2m_9
## 3.345509925 3.188151892 3.174570839
## wind_speed_10m_10 wind_speed_10m_12 wind_speed_10m_6
## 3.091389636 3.034517225 3.013540085
## nightlight_3000 road_class_2_3000 nightlight_800
## 3.000774549 2.999191707 2.973348601
## road_class_1_3000 nightlight_500 road_class_2_100
## 2.894829712 2.891141888 2.801783743
## temperature_2m_4 wind_speed_10m_11 temperature_2m_8
## 2.800313260 2.788977848 2.516188199
## temperature_2m_11 temperature_2m_3 road_class_M345_5000
## 2.500815920 2.448387441 2.435285211
## nightlight_1000 road_class_M345_3000 road_class_1_50
## 2.341835125 2.060459655 1.892885638
## road_class_M345_300 road_class_M345_100 wind_speed_10m_7
## 1.830864541 1.828406245 1.633014773
## elevation road_class_M345_50 road_class_1_25
## 1.534676855 1.439083048 1.382546872
## road_class_1_100 road_class_M345_1000 road_class_M345_500
## 1.283173699 1.271894989 1.197489627
## road_class_M345_25 road_class_M345_800 road_class_1_300
## 1.193402931 1.181633204 1.152755658
## industry_5000 road_class_1_500 road_class_2_300
## 1.113834636 1.112668723 0.964353489
## industry_3000 road_class_2_500 road_class_1_800
## 0.926751736 0.813381654 0.741938918
## road_class_2_1000 road_class_2_800 road_class_1_1000
## 0.659183925 0.605040255 0.547251968
## industry_1000 industry_800 industry_300
## 0.284753881 0.157210091 0.038823512
## industry_500 industry_100 industry_25
## 0.026901786 0.012181598 0.004743725
## industry_50
## 0.001226962
[Try after the workshop as it takes a while: variable importance and partial dependence plots can be calculated and presented using the vip and sparkline packages, which include more metrics and presentation functionalities.]
set.seed(2)
# Metrics available for permutation-based importance
vip::list_metrics()
# Permutation importance of rf w.r.t. value_mean under two metrics;
# the method and target are stated explicitly in the call.
DF_P_r2 <- vi(rf, method = "permute", target = "value_mean", metric = "r2")
DF_P_rmse <- vi(rf, method = "permute", target = "value_mean", metric = "rmse")
vip(DF_P_r2)
vip(DF_P_rmse)
# Importance table with per-variable sparkline PDPs, saved as an HTML widget
a <- add_sparklines(DF_P_r2, fit = rf)
saveWidget(a, file = "sparkline.html")
Use a subset of variables in RF to inspect partial dependence.
# Small predictor subset for partial-dependence inspection
pre_mat_s <- inde_var %>%
  na.omit() %>%
  ungroup() %>%
  dplyr::select(value_mean, road_class_2_50, nightlight_500,
                population_5000, road_class_M345_1000)
# Linear benchmark and a permutation-importance random forest on the subset
lm_s <- lm(value_mean ~ ., data = pre_mat_s)
rf_s <- ranger(value_mean ~ ., data = pre_mat_s, num.trees = 1000,
               importance = "permutation")
rf_s
## Ranger result
##
## Call:
## ranger(value_mean ~ ., data = pre_mat_s, num.trees = 1000, importance = "permutation")
##
## Type: Regression
## Number of trees: 1000
## Sample size: 5326
## Number of independent variables: 4
## Mtry: 2
## Target node size: 5
## Variable importance mode: permutation
## Splitrule: variance
## OOB prediction error (MSE): 100.9394
## R squared (OOB): 0.482523
Partial dependence is the marginal effect one or two features have on the predicted outcome of a machine learning model. It is the integration over all the other predictors at each value of the variable under inspection. The partial dependence plot is calculated under the assumption that the correlation between the variable being inspected and the variables being integrated over is low. For example, if we are interested in the effect of population within a 1000 m buffer but we integrate over population within a 3000 m buffer, then a high-population region according to the 1000 m buffer data may be combined with very low population within the 3000 m buffer, which is very unlikely.
# Pairwise correlations among the inspected predictors — PDPs assume these are low
pre_mat_predictor = pre_mat_s%>%dplyr::select(-value_mean)
ggpairs(pre_mat_predictor)
The partial dependence plots of linear and random forest regression
# Partial dependence on road_class_M345_1000: linear model vs random forest
p_lm = partial(lm_s, "road_class_M345_1000",plot = TRUE, rug = TRUE)
plot(p_lm) # Linear regression
p2 = partial(rf_s, "road_class_M345_1000",plot = TRUE, rug = TRUE)
plot(p2) # random forest
We can also inspect 2D and 3-D PDP plots for more in-depth insights of the joint effects from variables. [Try afterwork shop as it will take a while]
#slow
# rf_s was trained on population_5000 — population_3000 is absent from
# pre_mat_s, so partial() would fail on it; use population_5000 throughout.
pd <- partial(rf_s, pred.var = c("population_5000", "road_class_M345_1000"))
# Default PDP
pd1 = plotPartial(pd)
# Add contour lines and use a different color palette
rwb <- colorRampPalette(c("red", "white", "blue"))
pdp2 = plotPartial(pd, contour = TRUE, col.regions = rwb)
#3-D surface; the z axis is the predicted value_mean
pdp3 <- plotPartial(pd, levelplot = FALSE, zlab = "value_mean", colorkey = TRUE,
                    screen = list(z = -20, x = -60))
p3 = partial(rf_s, "road_class_2_50", plot = TRUE, rug = TRUE)
p1 = partial(rf_s, "population_5000", plot = TRUE, rug = TRUE)
# NOTE(review): grid.arrange() is from gridExtra, which is never installed or
# loaded in this document — add it to the package list before running.
grid.arrange(p1, p2, p3, pd1, pdp2, ncol = 2)
Though we used multiple “background variables” (e.g. climate, population) to try to discover hierarchical relationships, the models introduced above do not explicitly model hierarchical relationships (e.g. road effects within a country). For example, the NO2-road density relationships may be heterogeneous between countries due to different emission regulations, fuel standards (e.g. Brazil uses bio-fuels), as well as the sampling scheme (the geographical distribution) of air monitoring stations. Mixed-effect models (ME) have been developed specifically to model hierarchical relationships – data are clustered within higher-level clusters.
ME can be seen as a special case of the hierarchical Bayesian model, which assumes the random-effect coefficients are drawn from some unknown distribution.
The variables are scaled (mean 0, std 1) so that it is easier for the model to find relationships.
# Reproducible 80/20 train-test split of the complete station records.
set.seed(3)
merged_mr <- drop_na(merged_mr)
n_obs <- nrow(merged_mr)
smp_size <- floor(0.8 * n_obs)
training <- sample(seq_len(n_obs), size = smp_size)
test <- setdiff(seq_len(n_obs), training)
# Keep the predictor columns matched by `vastring`, standardise each one
# (mean 0, sd 1), then re-attach the response unscaled.
mpre <- subset_grep(merged_mr, vastring)
scaled1 <- data.frame(apply(mpre, 2, scale))
scaled1$value_mean <- merged_mr$value_mean
Let’s get some categorical variables for exploration. One is country; the others are two categorical variables constructed from the continuous variables TROPOMI and Night Light.
# Country of each station: the grouping factor for the mixed-effect models.
scaled1$country = merged_mr$country
# Discretise the TROPOMI NO2 column density into bins of width 100.
# `labels` is spelled out (the original relied on partial matching of
# `label`), and the redundant as.factor() is dropped: cut() already
# returns a factor.
# NOTE(review): cut() intervals are left-open by default, so the minimum
# value (and anything above the last break) maps to NA — confirm intended.
trop_breaks = seq(min(merged_mr$trop_mean_filt), max(merged_mr$trop_mean_filt), 100)
scaled1$tropofac = cut(merged_mr$trop_mean_filt, trop_breaks,
                       labels = seq_len(length(trop_breaks) - 1))
# Night-light intensity in four ordered categories; the -0.001 lower bound
# keeps exact zeros inside the first bin.
sep = c(-0.001, 10, 30, 60, max(merged_mr$nightlight_5000))
scaled1$nifac = cut(merged_mr$nightlight_5000, sep,
                    labels = seq_len(length(sep) - 1))
The two figures below show that the relationship between NO2 and close-by primary road density, as well as night light, is quite different between countries (first figure). The same applies to the relationship between station-measured NO2 concentration and TROPOMI NO2 column density (second figure).
# NO2 vs. nearby primary-road density for four countries, coloured by the
# night-light category; one facet per country.
four_countries <- scaled1 %>% filter(country %in% c("CN", "ES", "US", "DE"))
scaled2 <- ggplot(four_countries,
                  aes(x = road_class_2_50, y = value_mean, color = nifac)) +
  geom_point() +
  facet_wrap(~country) +
  ylab("NO2") +
  labs(color = 'Night Light (category)')
suppressWarnings(print(scaled2))
#ggsave("r50ni_4cou.png")
# NO2 vs. TROPOMI column density with a loess smoother, same four countries.
scaled2 <- ggplot(four_countries, aes(x = trop_mean_filt, y = value_mean)) +
  geom_point() +
  facet_wrap(~country) +
  geom_smooth(method = "loess") +
  ylab("NO2")
suppressWarnings(print(scaled2))
# ggsave("pop4cou.png")
Hierarchical models, or mixed-effect models, are used for “clustered data”. This suits our purpose well to model variations within and between regions. It may also be useful to apply it to spatiotemporal data – try to see if it is better than just using time (e.g. day of year, day of week, year) as predictor variables.
Let’s start by comparing a mixed-effect model vs. a linear model with categories as variables. The mixed-effect model with the intercept as the random effect is similar to a standard linear regression model with each category as a variable.
#select a few important variables
# dplyr::select is qualified explicitly for consistency with the rest of the
# script: several attached spatial packages also export a `select`, and an
# unqualified call is fragile against package load order.
scaled_sel = scaled1 %>% dplyr::select(
  population_5000, road_class_M345_1000, road_class_M345_50,
  road_class_2_100, road_class_2_25, trop_mean_filt,
  nightlight_500, road_class_2_50, country, value_mean
)
# Train and test rows from the split created earlier (same `training`/`test`).
scaledtrain = scaled_sel[training, ]
scaledtest = scaled_sel[test, ]
#lme model vs. lm
# Random-intercept model: one intercept offset per country, shared slopes.
me = lmer(value_mean ~ population_5000 +road_class_M345_1000 + road_class_M345_50 + road_class_2_100 + road_class_2_25 + trop_mean_filt + nightlight_500 +road_class_2_50+(1| country), scaledtrain)
# Baseline: ordinary linear model with country as a fixed-effect dummy.
fixe = lm(value_mean ~ population_5000 +road_class_M345_1000 + road_class_M345_50 + road_class_2_100 + road_class_2_25 + trop_mean_filt + nightlight_500 +road_class_2_50+country, scaledtrain)
# NOTE(review): predict() on a merMod errors for grouping levels absent from
# the training split unless allow.new.levels = TRUE — the seed used above
# apparently leaves every country represented in training; confirm.
pre_me1= predict(me, scaledtest)
pre_fixe= predict(fixe, scaledtest)
# Hold-out accuracy of the mixed-effect model (error_matrix from APMtools).
error_matrix(scaledtest$value_mean,prediction = pre_me1) # mixed effect
##          RMSE         RRMSE           IQR          rIQR           MAE
##     7.9379052     0.3163196     8.6785873     0.3690845     5.7584960
##          rMAE           rsq explained_var
##     0.2294718     0.6792423     0.6810828
# Hold-out accuracy of the fixed-effect (dummy-variable) baseline.
error_matrix(scaledtest$value_mean,prediction = pre_fixe)
##          RMSE         RRMSE           IQR          rIQR           MAE
##     7.9580977     0.3171243     8.6963534     0.3698401     5.7782847
##          rMAE           rsq explained_var
##     0.2302603     0.6776083     0.6795116
This time we also let the road density within a 50-meter buffer have a random slope (varying by country). We see a small increase in accuracy.
# Random-intercept AND random-slope model: the effect of road_class_2_50 is
# allowed to vary by country, in addition to the per-country intercept.
me2 = lmer(value_mean ~ population_5000 +road_class_M345_1000 + road_class_M345_50 + road_class_2_100 + road_class_2_25 + trop_mean_filt + nightlight_500 +road_class_2_50+(road_class_2_50| country), scaledtrain)
me2
## Linear mixed model fit by REML ['lmerMod']
## Formula:
## value_mean ~ population_5000 + road_class_M345_1000 + road_class_M345_50 +
##     road_class_2_100 + road_class_2_25 + trop_mean_filt + nightlight_500 +
##     road_class_2_50 + (road_class_2_50 | country)
##    Data: scaledtrain
## REML criterion at convergence: 29967.84
## Random effects:
##  Groups   Name            Std.Dev. Corr
##  country  (Intercept)     4.830
##           road_class_2_50 1.663    -0.48
##  Residual                 8.003
## Number of obs: 4260, groups:  country, 50
## Fixed Effects:
##          (Intercept)       population_5000  road_class_M345_1000
##             23.69309               0.95824               1.62563
##   road_class_M345_50      road_class_2_100       road_class_2_25
##              1.13633               1.44624               1.22720
##       trop_mean_filt        nightlight_500       road_class_2_50
##              7.34744               2.87043               0.02878
# Hold-out predictions and accuracy of the random-slope model.
pre_me2= predict(me2, scaledtest)
error_matrix(scaledtest$value_mean,prediction = pre_me2) # mixed effect
##          RMSE         RRMSE           IQR          rIQR           MAE
##     7.9352165     0.3162125     8.7080808     0.3703388     5.7413048
##          rMAE           rsq explained_var
##     0.2287867     0.6794595     0.6809082
# Optional comparison against a random forest on the same variables
# (left commented out; slow).
#rfpre= predict(ranger(value_mean~., data =scaled_sel),scaledtest)%>%predictions()
#error_matrix(scaledtest$value_mean,prediction = rfpre)
We don’t have time to cover everything in one lecture, but mixed-effect models are promising for large-scale regression and for analyzing spatiotemporal data. Here is something more interesting:
the application of mixed-effect random forest in Jupyter notebook in Python